/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Copyright 2024 The ScaleLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

#include <limits>
#include <optional>
#include <sstream>
#include <string>
#include <vector>

#include "common/macros.h"
#include "common/types.h"

namespace xllm {

class Options {
 public:
  Options() = default;
  ~Options() = default;

  std::string to_string() const;

 private:
  PROPERTY(std::string, model_path);

  PROPERTY(std::string, model_id);

  PROPERTY(std::optional<std::string>, devices);

  PROPERTY(std::optional<std::string>, draft_model_path);

  PROPERTY(std::optional<std::string>, draft_devices);

  // model backend
  PROPERTY(std::string, backend);

  // max image num per prompt, default 4
  PROPERTY(int32_t, limit_image_per_prompt) = 4;

  // block size, default 128
  PROPERTY(int32_t, block_size) = 128;

  // the maximum cache size in bytes, default is 0 which means cache size is
  // caculated by available memory * max_memory_utilization
  PROPERTY(int64_t, max_cache_size) = 0;

  // maximum memory utilization allowed, default 0.9
  PROPERTY(double, max_memory_utilization) = 0.9;

  PROPERTY(bool, enable_prefix_cache) = true;

  // max tokens num per batch
  PROPERTY(int32_t, max_tokens_per_batch) = 20480;

  // max sequences num per batch
  PROPERTY(int32_t, max_seqs_per_batch) = 256;

  // the max tokens per chunk for request in prefill stage.
  PROPERTY(int32_t, max_tokens_per_chunk_for_prefill);

  // sps tokens
  PROPERTY(int32_t, num_speculative_tokens) = 0;

  // thread num to handle requests
  PROPERTY(size_t, num_request_handling_threads) = 4;

  PROPERTY(std::optional<bool>, enable_eplb);

  PROPERTY(std::optional<int32_t>, redundant_experts_num);

  PROPERTY(std::optional<int64_t>, eplb_update_interval);

  PROPERTY(std::optional<double>, eplb_update_threshold);

  PROPERTY(std::optional<std::string>, communication_backend);

  PROPERTY(std::optional<std::string>, rank_tablefile);

  PROPERTY(std::optional<int32_t>, expert_parallel_degree);

  PROPERTY(std::string, task_type);

  PROPERTY(bool, enable_mla) = false;

  PROPERTY(bool, enable_chunked_prefill) = true;

  PROPERTY(std::optional<std::string>, master_node_addr);

  PROPERTY(int32_t, nnodes) = 1;

  PROPERTY(int32_t, node_rank) = 0;

  PROPERTY(int32_t, dp_size) = 1;

  PROPERTY(int32_t, ep_size) = 1;

  PROPERTY(std::optional<std::string>, xservice_addr);

  PROPERTY(std::optional<std::string>, instance_name);

  PROPERTY(bool, enable_disagg_pd) = false;

  PROPERTY(bool, enable_pd_ooc) = false;

  PROPERTY(bool, enable_schedule_overlap) = true;

  PROPERTY(InstanceRole, instance_role) = InstanceRole::DEFAULT;

  PROPERTY(std::string, kv_cache_transfer_mode) = "PUSH";

  PROPERTY(std::optional<std::string>, device_ip);

  PROPERTY(uint16_t, transfer_listen_port) = 26000;

  PROPERTY(std::optional<std::string>, etcd_addr);

  PROPERTY(bool, enable_service_routing) = false;

  PROPERTY(std::optional<std::string>, tool_call_parser);

  PROPERTY(std::optional<std::string>, reasoning_parser);

  PROPERTY(std::string, priority_strategy) = "FCFS";

  PROPERTY(bool, enable_online_preempt_offline) = true;

  PROPERTY(double, host_blocks_factor) = 0.0;

  PROPERTY(bool, enable_kvcache_store) = false;

  PROPERTY(bool, enable_cache_upload) = false;

  PROPERTY(std::string, store_protocol) = "tcp";

  PROPERTY(std::string, store_master_server_address) = "";

  PROPERTY(std::string, store_metadata_server) = "";

  PROPERTY(std::string, store_local_hostname) = "";

  PROPERTY(bool, enable_multi_stream_parallel) = false;

  PROPERTY(bool, enable_profile_step_time) = false;

  PROPERTY(bool, enable_profile_token_budget) = false;

  PROPERTY(bool, enable_latency_aware_schedule) = false;
  // the max prompt length for profile
  PROPERTY(int32_t, profile_max_prompt_length) = 2048;
  // true if generate kv cache for profile
  PROPERTY(bool, enable_profile_kv_blocks) = true;
  // true if disable ttft profiling
  PROPERTY(bool, disable_ttft_profiling) = false;
  // true if enable forward interruption
  PROPERTY(bool, enable_forward_interruption) = false;
  // all requests use single global ttft
  PROPERTY(int32_t, max_global_ttft_ms) = std::numeric_limits<int32_t>::max();
  // all requests use single global tpot
  PROPERTY(int32_t, max_global_tpot_ms) = std::numeric_limits<int32_t>::max();

  // dit
  // max requests per batch
  PROPERTY(int, max_requests_per_batch) = 0;

  PROPERTY(bool, enable_continuous_kvcache) = false;

  // for offline inference: start with offline inference, default is false
  PROPERTY(bool, enable_offline_inference) = false;
  // for offline inference: the path to spawn worker binary
  PROPERTY(std::string, spawn_worker_path) = "";

  // use shared memory for inter-process communication in the single-machine
  // multi-GPU scenario.
  PROPERTY(bool, enable_shm) = false;

  // whether the worker and master are on the same machine.
  PROPERTY(bool, is_local) = false;
};

}  // namespace xllm
